Filter criteria:
Filter differentially expressed genes between autism and control (p-value < 0.05)
No samples are removed based on network connectivity z-scores
# Print a summary of the filtered dataset: number of genes kept and sample
# counts split by diagnosis (any sample not labelled 'ASD' counts as control)
glue('Number of genes: ', nrow(datExpr), '\n',
'Number of samples: ', ncol(datExpr), ' (', sum(datMeta$Diagnosis_=='ASD'), ' ASD, ',
sum(datMeta$Diagnosis_!='ASD'), ' controls)')
## Number of genes: 2928
## Number of samples: 86 (51 ASD, 35 controls)
The first principal component explains 89.7% of the total variance (just under 90%)
# Reduce the dimensionality of the expression data with PCA, keeping the
# smallest number of leading components whose cumulative explained variance
# reaches `var_explained`. Also plots the per-component and cumulative
# variance profiles with the chosen cut-off marked in blue.
#
# Args:
#   datExpr:       expression matrix/data.frame (genes as rows)
#   datMeta:       metadata (unused here; kept for interface compatibility)
#   var_explained: cumulative proportion of variance to retain (default 0.98)
#
# Returns: list with 'datExpr' (genes x retained PCs data.frame) and
# 'pca_output' (the full prcomp object, needed later for projections).
reduce_dim_datExpr = function(datExpr, datMeta, var_explained=0.98){
  datExpr = data.frame(datExpr)
  datExpr_pca = prcomp(datExpr, scale.=TRUE)
  # Row 3 of the importance table is the cumulative proportion of variance and
  # is already ordered PC1, PC2, ...; the first entry reaching the threshold
  # gives the number of PCs to keep. (The previous selection via
  # top_n(-1, ID) on the "PCn" strings ordered them lexically, where e.g.
  # 'PC10' < 'PC9', so it could pick the wrong component.)
  cum_var = summary(datExpr_pca)$importance[3,]
  n_pcs = which(cum_var >= var_explained)[1]
  par(mfrow=c(1,2))
  plot(summary(datExpr_pca)$importance[2,], type='b')
  abline(v=n_pcs, col='blue')
  plot(cum_var, type='b')
  abline(h=var_explained, col='blue')
  print(glue('Keeping top ', n_pcs, ' components that explain ',
             var_explained*100, '% of the variance'))
  # Keep the first n_pcs score columns (drop=FALSE protects the n_pcs==1 case)
  datExpr_top_pc = data.frame(datExpr_pca$x[, seq_len(n_pcs), drop=FALSE])
  return(list('datExpr'=datExpr_top_pc, 'pca_output'=datExpr_pca))
}
# Reduce the expression matrix to the top principal components (98% of the
# variance by default) and keep the full prcomp object for later projections
reduce_dim_output = reduce_dim_datExpr(datExpr, datMeta)
## Keeping top 11 components that explain 98% of the variance
datExpr_redDim = reduce_dim_output$datExpr
pca_output = reduce_dim_output$pca_output
# Drop objects no longer needed (datExpr is reloaded from disk further down)
rm(datSeq, datProbes, reduce_dim_datExpr, reduce_dim_output, datExpr)
Perhaps the first component is determined by a small number of samples and by removing them we can lower the variance explained by it:
# Sorted PC1 loadings: a handful of dominant samples would show as a sharp drop
pca_output$rotation[,1] %>% sort(decreasing = TRUE) %>% plot
Genes are still separated into two clouds of points:
# Scatter of the genes on the first two retained principal components
ggplot(datExpr_redDim, aes(x=PC1, y=PC2)) + geom_point() + theme_minimal()
Projecting all the original points into the space created by the two principal components and colouring by the differential expression p-value we can see that the points in the middle of the two clouds were filtered out because their DE wasn’t statistically significant. Colouring by their log2 fold change we can see that the genes from the cloud on the top are overexpressed and the genes in the bottom one underexpressed.
# Reload the full (unfiltered) normalized expression data so every gene can be
# projected onto the PCA space fitted on the filtered set
load('./../working_data/RNAseq_ASD_4region_normalized.Rdata')
# Remove Subject with ID = 'AN03345'
keep = datMeta$Subject_ID!='AN03345'
datMeta = datMeta[keep,]
datExpr = datExpr[,keep]
# Calculate differential expression p-value of each gene
# limma pipeline: duplicateCorrelation estimates the within-subject
# correlation (several brain regions per subject), which lmFit then uses as a
# blocking correlation; eBayes with trend/robust handles the mean-variance
# relationship of expression data
mod = model.matrix(~ Dx, data=datMeta)
corfit = duplicateCorrelation(datExpr, mod, block=datMeta$Subject_ID)
lmfit = lmFit(datExpr, design=mod, block=datMeta$Subject_ID, correlation=corfit$consensus)
fit = eBayes(lmfit, trend=T, robust=T)
# Retrieve all genes and reorder them to match the rows of datExpr
top_genes = topTable(fit, coef=2, number=nrow(datExpr))
ordered_top_genes = top_genes[match(rownames(datExpr), rownames(top_genes)),]
# Note: adj.P.Val is the multiple-testing-adjusted p-value, not the raw one
ASD_pvals = ordered_top_genes$adj.P.Val
log_fold_change = ordered_top_genes$logFC
# Project every gene into the PC space of the filtered data; scale() mirrors
# the scale.=TRUE used when the PCA was fitted
pca_data_projection = scale(datExpr) %*% pca_output$rotation %>% data.frame
# Side-by-side projections coloured by adjusted p-value and by log fold change
p1 = pca_data_projection %>% ggplot(aes(x=PC1, y=PC2, color=ASD_pvals)) + geom_point(alpha=0.5) + theme_minimal()
p2 = pca_data_projection %>% ggplot(aes(x=PC1, y=PC2, color=log_fold_change)) + geom_point() + theme_minimal() +
scale_colour_gradient2()
grid.arrange(p1, p2, ncol=2)
rm(keep, mod, corfit, lmfit, fit, ASD_pvals, log_fold_change, top_genes, ordered_top_genes, datExpr, datProbes,
datSeq, p1, p2)
Only 165 of our 2928 genes appear on the SFARI list, losing all 25 genes with a score of 1
# Load the SFARI autism gene annotations (scores 1-6 plus a syndromic flag)
SFARI_genes = read_csv('./../working_data/SFARI_genes_01-15-2019.csv',
col_types = cols(`number-of-reports` = col_integer(), syndromic = col_logical()))
print('Gene Score count of all genes:')
## [1] "Gene Score count of all genes:"
SFARI_genes$`gene-score` %>% table
## .
## 1 2 3 4 5 6
## 25 62 195 454 175 25
# Map SFARI gene symbols to Ensembl IDs with the Feb 2014 Ensembl archive,
# matching the Gencode v19 annotation of the expression data
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',
dataset='hsapiens_gene_ensembl',
host='feb2014.archive.ensembl.org') ## Gencode v19
gene_names = getBM(attributes=c('ensembl_gene_id', 'hgnc_symbol'), filters=c('hgnc_symbol'),
values=SFARI_genes$`gene-symbol`, mart=mart) %>% mutate('gene-symbol'=hgnc_symbol) %>%
dplyr::select('ensembl_gene_id', 'gene-symbol')
# NOTE(review): one symbol can map to several Ensembl IDs, so these joins can
# duplicate rows -- confirm that is acceptable downstream
SFARI_genes = left_join(SFARI_genes, gene_names, by='gene-symbol')
gene_scores = SFARI_genes %>% dplyr::select(ensembl_gene_id, `gene-score`, syndromic) %>%
right_join(gene_names, by='ensembl_gene_id') %>% dplyr::select(-'gene-symbol')
# Full (ordered) list of datExpr genes with their scores
# Genes absent from SFARI end up with NA score, syndromic = FALSE and
# gene-bool = FALSE (gene-bool marks membership in the SFARI list)
gene_scores = data.frame('ensembl_gene_id'=rownames(datExpr_redDim)) %>%
left_join(gene_scores, by = 'ensembl_gene_id') %>%
mutate('syndromic' = ifelse(syndromic==FALSE | is.na(syndromic), FALSE, TRUE),
'gene-bool' = ifelse(is.na(`gene-score`), FALSE, TRUE))
'Gene score count of genes in datExpr:'
## [1] "Gene score count of genes in datExpr:"
gene_scores$`gene-score` %>% table
## .
## 2 3 4 5 6
## 5 31 66 40 4
rm(SFARI_genes, mart, gene_names)
# Collect every gene partition in a single named list; the SFARI annotations
# are stored alongside the clusterings so they can be compared with the same
# tools. Each entry is a vector named by Ensembl gene ID.
clusterings = list()
clusterings[['SFARI_score']] = setNames(gene_scores$`gene-score`, gene_scores$ensembl_gene_id)
clusterings[['SFARI_bool']] = setNames(gene_scores$`gene-bool`, gene_scores$ensembl_gene_id)
clusterings[['syndromic']] = setNames(gene_scores$syndromic, gene_scores$ensembl_gene_id)
# Elbow plot for k-means over k = 1..10 (seed fixed for reproducibility)
set.seed(123)
wss = sapply(1:10, function(k) kmeans(datExpr_redDim, k, iter.max=100, nstart=25,
algorithm='MacQueen')$tot.withinss)
plot(wss, type='b', main='K-Means Clustering')
best_k = 3
abline(v = best_k, col='blue')
# Final clustering with the chosen k
# NOTE(review): this call uses kmeans' default algorithm while the elbow scan
# above used MacQueen -- confirm the inconsistency is intentional
datExpr_k_means = kmeans(datExpr_redDim, best_k, iter.max=100, nstart=25)
clusterings[['km']] = datExpr_k_means$cluster
k=6 was chosen as the best number of clusters.
Clusters seem to be able to separate ASD and control samples pretty well and there are no noticeable patterns regarding sex, age or brain region in any cluster.
Younger ASD samples seem to be more similar to control samples than older ASD samples (pink cluster has most of the youngest samples). The yellow cluster is made of young ASD samples.
# Hierarchical clustering of the genes on Euclidean distances in PC space
h_clusts = hclust(dist(datExpr_redDim))
# plot(h_clusts, hang = -1, cex = 0.6, labels=FALSE)
best_k = 6
clusterings[['hc']] = cutree(h_clusts, k = best_k)
# Build a lookup vector mapping each SFARI score present in the data to a
# viridis colour; the vector is named by score so colours can be looked up
# by score value.
create_viridis_dict = function(){
  score_range = range(clusterings[['SFARI_score']], na.rm=TRUE)
  scores = seq(score_range[1], score_range[2])
  setNames(viridis(length(scores)), scores)
}
viridis_score_cols = create_viridis_dict()
# Colour bars for the gene dendrogram: SFARI score (viridis), SFARI membership
# (aqua) and syndromic status (orange), reordered to match the dendrogram
# leaves. `gene-score` is numeric, so it must be converted to character before
# indexing viridis_score_cols: the colour vector is *named* by score, and a
# bare numeric index would select by position (scores start at 2, shifting
# every colour by one).
dend_meta = gene_scores[match(labels(h_clusts), gene_scores$ensembl_gene_id),] %>%
mutate('SFARI_score' = viridis_score_cols[as.character(`gene-score`)], # Purple: 2, Yellow: 6
'SFARI_bool' = ifelse(`gene-bool` == T, '#21908CFF', 'white'), # Aqua
'Syndromic' = ifelse(syndromic == T, 'orange', 'white')) %>%
dplyr::select(SFARI_score, SFARI_bool, Syndromic)
# The dendrogram leaves are genes (rows of datExpr_redDim), not samples, so
# the blank label vector must have one entry per gene; nrow(datMeta) was the
# number of samples and only worked through recycling.
h_clusts %>% as.dendrogram %>% set('labels', rep('', nrow(datExpr_redDim))) %>%
set('branches_k_color', k=best_k) %>% plot
colored_bars(colors=dend_meta)
Samples are grouped into two big clusters and two outliers; the first big cluster has three main subclusters, two small subclusters and two outliers, and the second one has two main subclusters and five small subclusters.
*Output plots in clustering_genes_03_20 folder
Following this paper’s guidelines:
Run PCA and keep enough components to explain 60% of the variance
Run ICA with that same number of nbComp as principal components kept to then filter them
Select components with kurtosis > 3
Assign genes to clusters with FDR<0.01 using the fdrtool package
# ICA (JADE) with as many components as retained PCs; genes are then assigned
# to super-Gaussian components (kurtosis > 3) at FDR < 0.01.
ICA_output = datExpr_redDim %>% runICA(nbComp=ncol(datExpr_redDim), method='JADE')
# Keep only source signals with kurtosis > 3. The comparison must go through
# which(): names(x > 3) returns the names of *every* element of the logical
# vector, so the original selection kept all components and the kurtosis
# filter was a no-op.
signals_w_kurtosis = ICA_output$S %>% data.frame %>%
dplyr::select(names(which(apply(ICA_output$S, 2, kurtosis) > 3)))
# A gene belongs to a component when its fdrtool q-value is below 0.01
ICA_clusters = apply(signals_w_kurtosis, 2, function(x) fdrtool(x, plot=F)$qval<0.01) %>% data.frame
# Discard singleton components
ICA_clusters = ICA_clusters[colSums(ICA_clusters)>1]
# Genes in several components keep the last component iterated over; genes in
# none stay NA (tracked separately in ICA_NA)
clusterings[['ICA_min']] = rep(NA, nrow(ICA_clusters))
ICA_clusters_names = colnames(ICA_clusters)
for(c in ICA_clusters_names) clusterings[['ICA_min']][ICA_clusters[,c]] = c
clusterings[['ICA_NA']] = is.na(clusterings[['ICA_min']])
Leaves most of the observations (~80%) without a cluster (every time it leaves out more genes):
# How many ICA components each gene was assigned to
ICA_clusters %>% rowSums %>% table
## .
## 0 1 2 3 4 5
## 2355 425 112 27 8 1
# Gene-by-component membership heatmap
ICA_clusters %>% mutate(cl_sum=rowSums(.)) %>% as.matrix %>% melt %>% ggplot(aes(Var2, Var1)) +
geom_tile(aes(fill=value)) + xlab('Clusters') + ylab('Samples') +
theme_minimal() + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) + coord_flip()
best_power=56 but blockwiseModules only accepts powers up to 30, so 30 was used instead
# WGCNA: scan soft-thresholding powers 20..60; the scale-free fit only passes
# R^2 = 0.85 at power 56 (see row 19 below), above the maximum power that
# blockwiseModules accepts
best_power = datExpr_redDim %>% t %>% pickSoftThreshold(powerVector = seq(20, 60, by=2))
## Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
## 1 20 0.238 -0.225 0.940 516 517 1130
## 2 22 0.302 -0.248 0.933 478 463 1080
## 3 24 0.366 -0.272 0.942 446 417 1040
## 4 26 0.436 -0.293 0.958 417 376 998
## 5 28 0.485 -0.314 0.949 391 339 963
## 6 30 0.522 -0.330 0.945 368 309 931
## 7 32 0.578 -0.352 0.967 347 282 901
## 8 34 0.625 -0.371 0.965 328 258 873
## 9 36 0.645 -0.384 0.954 311 236 847
## 10 38 0.688 -0.402 0.946 295 215 822
## 11 40 0.715 -0.415 0.938 281 200 799
## 12 42 0.740 -0.426 0.935 267 185 778
## 13 44 0.763 -0.442 0.930 255 172 757
## 14 46 0.787 -0.457 0.936 244 159 738
## 15 48 0.802 -0.469 0.933 233 148 719
## 16 50 0.809 -0.480 0.923 223 138 702
## 17 52 0.823 -0.493 0.924 214 128 685
## 18 54 0.839 -0.503 0.929 206 119 669
## 19 56 0.854 -0.515 0.933 198 111 654
## 20 58 0.858 -0.527 0.926 190 104 640
## 21 60 0.872 -0.534 0.937 183 98 626
# Build the co-expression network with the largest accepted power (30) instead
network = datExpr_redDim %>% t %>% blockwiseModules(power=30, numericLabels=TRUE)
clusterings[['WGCNA']] = network$colors
names(clusterings[['WGCNA']]) = rownames(datExpr_redDim)
It leaves 304 genes without a cluster:
# WGCNA module sizes; label 0 is the set of genes assigned to no module
clusterings[['WGCNA']] %>% table
## .
## 0 1 2 3 4
## 304 1742 528 227 127
Number of Gaussian mixture components that best fits the data (by BIC) = 39
# Score Gaussian-mixture fits with 1..50 components using BIC
n_clust = Optimal_Clusters_GMM(datExpr_redDim, max_clusters=50, criterion='BIC', plot_data=FALSE)
plot(n_clust, type='l', main='Bayesian Information Criterion to choose number of clusters')
best_k = 39
gmm = GMM(datExpr_redDim, best_k)
# Assign every gene to the mixture component with the highest log-likelihood
clusterings[['GMM']] = apply(gmm$Log_likelihood, 1, which.max)
Plot of clusters with their centroids in gray
# Append the mixture centroids to the gene coordinates; centroids get NA
# labels so ggplot renders them in the default grey
gmm_points = rbind(datExpr_redDim, setNames(data.frame(gmm$centroids), names(datExpr_redDim)))
gmm_labels = as.factor(c(clusterings[['GMM']], rep(NA, best_k)))
ggplotly(ggplot(gmm_points, aes(x=PC1, y=PC2, color=gmm_labels)) + geom_point() + theme_minimal())
Trying with 27 clusters (27 has the 9th lowest value)
# Second GMM attempt with fewer components (27 had the 9th lowest BIC);
# overwrites gmm from the previous fit
best_k = 27
gmm = datExpr_redDim %>% GMM(best_k)
clusterings[['GMM_2']] = gmm$Log_likelihood %>% apply(1, which.max)
clusterings[['GMM_2']] %>% table
## .
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## 159 71 159 165 109 122 96 106 135 78 54 109 154 90 146 71 102 40
## 19 20 21 22 23 24 25 26 27
## 160 159 99 87 55 98 82 67 155
Trying with 6 clusters
# Third GMM attempt with only 6 components, matching the hierarchical k
best_k = 6
gmm = datExpr_redDim %>% GMM(best_k)
clusterings[['GMM_3']] = gmm$Log_likelihood %>% apply(1, which.max)
clusterings[['GMM_3']] %>% table
## .
## 1 2 3 4 5 6
## 662 455 366 684 431 330
Separate the two clouds of points by a straight line. There seems to be a difference in the mean expression of the genes between clusters but not in their standard deviation.
# Modify equation sign if needed so blue cluster represents overexpressed genes
# Manual split: a straight line in the PC1/PC2 plane separating the two gene
# clouds; label 1 = genes below the line, label 0 = genes above it
manual_clusters = as.factor(as.numeric(-0.1*datExpr_redDim$PC1 + 0.05 > datExpr_redDim$PC2))
datExpr_redDim %>% ggplot(aes(x=PC1, y=PC2, color=manual_clusters)) + geom_point() +
geom_abline(slope=-0.1, intercept=0.05, color='gray') + theme_minimal()
names(manual_clusters) = rownames(datExpr_redDim)
clusterings[['Manual']] = manual_clusters
clusterings[['Manual']] %>% table
## .
## 0 1
## 1713 1215
Cluster 2 has a bigger mean expression than cluster 1. There also seems to be more than one distribution in cluster 2, with two different means and three different standard deviations.
# Per-gene mean and sd across the retained PCs, tagged with the manual
# cluster. Note: cbind() coerces the 0/1 factor to its integer codes, so the
# clusters are labelled 1/2 in this data frame.
manual_clusters_data = cbind(apply(datExpr_redDim, 1, mean), apply(datExpr_redDim, 1, sd),
manual_clusters) %>% data.frame
colnames(manual_clusters_data) = c('mean','sd','cluster')
manual_clusters_data = manual_clusters_data %>% mutate('cluster'=as.factor(cluster))
# Density of gene means and sds, split by cluster
p1 = manual_clusters_data %>% ggplot(aes(x=mean, color=cluster, fill=cluster)) +
geom_density(alpha=0.4) + theme_minimal()
p2 = manual_clusters_data %>% ggplot(aes(x=sd, color=cluster, fill=cluster)) +
geom_density(alpha=0.4) + theme_minimal()
grid.arrange(p1, p2, ncol=2)
Separating cluster 2 genes by their sd: It seems that the bigger the sd of the cluster the genes belong to, the bigger their (absolute) value is in the first PC component (blue samples are really close to 0, and the purple ones are between 10 and 20). The second PC also seems to play a small part on this.
# Reproduce ggplot2's default discrete palette: n equally spaced hues around
# the HCL colour wheel at chroma 100 and luminance 65. The wheel spans
# 15..375 degrees with n + 1 stops, and the duplicated last hue is dropped.
gg_colour_hue = function(n) {
  hue_angles = seq(15, 375, length.out = n + 1)
  full_wheel = hcl(h = hue_angles, c = 100, l = 65)
  full_wheel[seq_len(n)]
}
n_clusters = 3
# Genes in manual cluster 1: in manual_clusters_data the 0/1 factor was
# coerced to codes 1/2 by cbind(), so that cluster is labelled '2' here
c2_sd = manual_clusters_data %>% filter(cluster==2) %>% dplyr::select(sd)
rownames(c2_sd) = rownames(manual_clusters_data)[manual_clusters_data$cluster=='2']
# 1-D Gaussian mixture on the sd values
gmm_c2_sd = c2_sd %>% GMM(n_clusters)
# Relabel cluster-1 genes as '1_<component>'; cluster-0 genes keep label '0'
clusterings[['Manual_sd']] = as.character(clusterings[['Manual']])
clusterings[['Manual_sd']][clusterings[['Manual']]==1] = gmm_c2_sd$Log_likelihood %>%
apply(1, function(x) glue('1_',which.max(x)))
# Overlay the fitted mixture components on the sd density
# NOTE(review): covariance_matrices appears to hold variances but is passed
# straight to dnorm's sd argument -- confirm whether a sqrt() is missing
plot_gaussians = manual_clusters_data %>% ggplot(aes(x=sd)) +
stat_function(fun=dnorm, n=100, colour=gg_colour_hue(n_clusters+1)[2], # green
args=list(mean=gmm_c2_sd$centroids[1], sd=gmm_c2_sd$covariance_matrices[1])) +
stat_function(fun=dnorm, n=100, colour=gg_colour_hue(n_clusters+1)[3], # aqua
args=list(mean=gmm_c2_sd$centroids[2], sd=gmm_c2_sd$covariance_matrices[2])) +
stat_function(fun=dnorm, n=100, colour=gg_colour_hue(n_clusters+1)[4], # purple (4th hue, not aqua)
args=list(mean=gmm_c2_sd$centroids[3], sd=gmm_c2_sd$covariance_matrices[3])) +
theme_minimal()
plot_points = datExpr_redDim %>% ggplot(aes_string(x='PC1', y='PC2', color=as.factor(clusterings[['Manual_sd']]))) +
geom_point() + theme_minimal()
grid.arrange(plot_gaussians, plot_points, ncol=2)
clusterings[['Manual_sd']] %>% table
## .
## 0 1_1 1_2 1_3
## 1713 422 436 357
Separating cluster 2 genes by their mean:
# Same procedure as above, now splitting manual cluster 1 into 2 subgroups by
# mean expression
n_clusters = 2
c2_mean = manual_clusters_data %>% filter(cluster==2) %>% dplyr::select(mean)
rownames(c2_mean) = rownames(manual_clusters_data)[manual_clusters_data$cluster=='2']
gmm_c2_mean= c2_mean %>% GMM(n_clusters)
# Relabel cluster-1 genes as '1_<component>'; cluster-0 genes keep label '0'
clusterings[['Manual_mean']] = as.character(clusterings[['Manual']])
clusterings[['Manual_mean']][clusterings[['Manual']]==1] = gmm_c2_mean$Log_likelihood %>%
apply(1, function(x) glue('1_',which.max(x)))
# Overlay the fitted mixture components on the mean density
# NOTE(review): covariance_matrices appears to hold variances but is passed
# straight to dnorm's sd argument -- confirm whether a sqrt() is missing
plot_gaussians = manual_clusters_data %>% ggplot(aes(x=mean)) +
stat_function(fun=dnorm, n=100, colour=gg_colour_hue(n_clusters+1)[2], # green
args=list(mean=gmm_c2_mean$centroids[1], sd=gmm_c2_mean$covariance_matrices[1])) +
stat_function(fun=dnorm, n=100, colour=gg_colour_hue(n_clusters+1)[3], # blue
args=list(mean=gmm_c2_mean$centroids[2], sd=gmm_c2_mean$covariance_matrices[2])) +
theme_minimal()
plot_points = datExpr_redDim %>% ggplot(aes_string(x='PC1', y='PC2', color=as.factor(clusterings[['Manual_mean']]))) +
geom_point() + theme_minimal()
grid.arrange(plot_gaussians, plot_points, ncol=2)
clusterings[['Manual_mean']] %>% table
## .
## 0 1_1 1_2
## 1713 622 593
# Clean up intermediates from all clustering sections
# NOTE(review): cc_output, cc_output_c1 and cc_output_c2 come from a
# consensus-clustering chunk not visible here -- confirm they exist
rm(wss, datExpr_k_means, h_clusts, cc_output, cc_output_c1, cc_output_c2, best_k, ICA_output,
ICA_clusters_names, signals_w_kurtosis, n_clust, gmm, gmm_points, gmm_labels, network,
best_power, c, manual_clusters, manual_clusters_data, c2_sd, c2_mean, gmm_c2_sd, gmm_c2_mean,
p1, p2, pca_data_projection, dend_meta, plot_gaussians, plot_points, n_clusters,
viridis_score_cols, gg_colour_hue, create_viridis_dict)
Using Adjusted Rand Index:
Clusterings seem to give very different results and none resembles the manual separation
Simple methods give similar results (K-means, hierarchical clustering, consensus clustering)
# Pairwise adjusted Rand index between every pair of clusterings; only the
# upper triangle (including the diagonal) is filled, the rest stays NA
n_methods = length(clusterings)
cluster_sim = data.frame(matrix(nrow = n_methods, ncol = n_methods))
for (i in seq_len(n_methods)) {
  cluster1 = as.factor(clusterings[[i]])
  for (j in i:n_methods) {
    cluster2 = as.factor(clusterings[[j]])
    cluster_sim[i, j] = adj.rand.index(cluster1, cluster2)
  }
}
rownames(cluster_sim) = names(clusterings)
colnames(cluster_sim) = names(clusterings)
cluster_sim = round(as.matrix(cluster_sim), 2)
# Annotated heatmap of the similarity matrix
heatmap.2(x = cluster_sim, Rowv = FALSE, Colv = FALSE, dendrogram = 'none',
          cellnote = cluster_sim, notecol = 'black', trace = 'none', key = FALSE,
          cexRow = 1, cexCol = 1, margins = c(7,7))
rm(i, j, cluster1, cluster2, cluster_sim, n_methods)
# Interactive 3-D scatter of the genes on PC1-PC3, coloured by the column
# cat_var of plot_points; axis titles carry each PC's explained variance.
# plot_points is resolved at call time, so it only needs to exist before the
# calls further down, even though it is defined after this function.
create_3D_plot = function(cat_var){
plot_points %>% plot_ly(x=~PC1, y=~PC2, z=~PC3) %>% add_markers(color=plot_points[,cat_var], size=1) %>%
layout(title = glue('Samples coloured by ', cat_var),
scene = list(xaxis=list(title=glue('PC1 (',round(summary(pca_output)$importance[2,1]*100,2),'%)')),
yaxis=list(title=glue('PC2 (',round(summary(pca_output)$importance[2,2]*100,2),'%)')),
zaxis=list(title=glue('PC3 (',round(summary(pca_output)$importance[2,3]*100,2),'%)'))))
}
# One data frame with every clustering as a factor column, feeding all plots
# NOTE(review): cc_l1/cc_l2 come from a consensus-clustering chunk not shown
# here -- confirm those entries exist in clusterings at this point
plot_points = datExpr_redDim %>% data.frame() %>% dplyr::select(PC1:PC3) %>%
mutate(ID = rownames(.), km_clust = as.factor(clusterings[['km']]),
hc_clust = as.factor(clusterings[['hc']]), cc_l1_clust = as.factor(clusterings[['cc_l1']]),
cc_clust = as.factor(clusterings[['cc_l2']]), ica_clust = as.factor(clusterings[['ICA_min']]),
n_ica_clust = as.factor(rowSums(ICA_clusters)), gmm_clust = as.factor(clusterings[['GMM']]),
gmm_clust2 = as.factor(clusterings[['GMM_2']]), gmm_clust3 = as.factor(clusterings[['GMM_3']]),
wgcna_clust = as.factor(clusterings[['WGCNA']]), manual_clust=as.factor(clusterings[['Manual']]),
manual2_clust = as.factor(clusterings[['Manual_mean']]),
manual3_clust = as.factor(clusterings[['Manual_sd']]),
SFARI_clust = as.factor(clusterings[['SFARI_score']]),
SFARI_bool = as.factor(clusterings[['SFARI_bool']]),
syndromic = as.factor(clusterings[['syndromic']]))
Simple methods seem to only partition the space in buckets using information from the first component
All clusterings except K-means manage to separate the two clouds at least partially, but none does a good job
WGCNA creates clusters inverted between clouds
Manual classification + GMM by standard deviation clusters make sense
SFARI genes seem to be everywhere
# Interactive comparison of all clusterings (selectable_scatter_plot is a
# project helper defined outside this file)
selectable_scatter_plot(plot_points, plot_points)
# 3-D views of the clusterings of interest
create_3D_plot('ica_clust')
create_3D_plot('gmm_clust')
create_3D_plot('gmm_clust3')
create_3D_plot('wgcna_clust')
create_3D_plot('manual2_clust')